Basic Data Management


In [ ]:
# Raw column vectors, five observations each. NA marks a missing value.
# These vectors remain in the global environment after the data frame is built.
manager <- c(1, 2, 3, 4, 5)
date    <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender  <- c("M", "F", "F", "M", "F")
age     <- c(32, 45, 25, 39, 99)

q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)

# Combine the vectors column-wise; character columns stay character.
leadership <- data.frame(
  manager = manager,
  date    = date,
  country = country,
  gender  = gender,
  age     = age,
  q1 = q1, q2 = q2, q3 = q3, q4 = q4, q5 = q5,
  stringsAsFactors = FALSE
)

In [ ]:
# Auto-print the data frame to inspect it.
leadership

Creating new variables


In [ ]:
# The preferred way to add derived columns is transform().
# Build a small example data frame to try it on.
mydata <- data.frame(
  x1 = c(2, 2, 6, 4),
  x2 = c(3, 4, 2, 8)
)
mydata

In [ ]:
# Append the row-wise sum and mean of x1 and x2 as new columns.
mydata <- transform(
  mydata,
  sumx  = x1 + x2,
  meanx = (x1 + x2) / 2
)
mydata

Recoding variables

Several packages also provide recoding helpers:

  1. car's recode() function.
  2. doBy's recodeVar() function.

In [ ]:
# Recode age values equal to 99 as missing (NA).
leadership$age[leadership$age == 99] <- NA

In [ ]:
# Show the data frame after the recode.
leadership

In [ ]:
# Derive a categorical age band from age:
#   age > 75         -> "Elder"
#   55 <= age <= 75  -> "Middle Aged"
#   age < 55         -> "Young"
# Rows whose age is NA get agecat = NA.
leadership <- within(leadership, {
  agecat <- ifelse(
    age > 75,
    "Elder",
    ifelse(age >= 55, "Middle Aged", "Young")
  )
})
leadership

Renaming variables


In [ ]:
# Alternatively, use the rename() function from the plyr package:
# rename(dataframe, c(oldname="newname", oldname="newname",...))

In [ ]:
# List the current column names.
names(leadership)

In [ ]:
# Rename the second column (originally "date") to "testDate".
names(leadership)[2] <- "testDate"

In [ ]:
# Show the data frame with the renamed column.
leadership

Missing values


In [ ]:
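# NA is not comparable to anything, not even to itself, so test for it with is.na()
# is.nan() and is.infinite() test for "not a number" and for infinite values
# many functions accept na.rm = TRUE to ignore missing values, e.g. sum(x, na.rm = TRUE)
# use na.omit(dataframe) to drop every row that contains an NA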

In [ ]:
# is.na() tests each element for missingness; only the last element is NA here.
y <- c(1, 2, 3, NA)
is.na(y)

In [ ]:
# NA is "missing", not "not a number", so is.nan() is FALSE for every element.
is.nan(y)

In [ ]:
# On a data frame slice (columns 6 to 10, i.e. q1 to q5) is.na() returns a
# logical matrix with one cell per value.
is.na(leadership[,6:10])

In [ ]:
# sin(Inf) evaluates to NaN (R also emits a warning), so this is TRUE.
is.nan(sin(Inf))

In [ ]:
# Inf is infinite, so this is TRUE.
is.infinite(Inf)

In [ ]:
# NaN is not infinite, so this is FALSE.
is.infinite(sin(Inf))

In [ ]:
# A single NA makes the whole sum NA.
x <- c(1, 2, NA, 3)
sum(x)

In [ ]:
# na.rm = TRUE drops the missing value before summing.
sum(x, na.rm = TRUE)

In [ ]:
# Show the data frame before removing incomplete rows.
leadership

In [ ]:
# na.omit() drops every row that contains at least one NA.
newdata <- na.omit(leadership)
newdata

Dates


In [ ]:
# Useful functions
# as.Date(str, format)
# Sys.Date()
# date()
# format(Dateobject, format string)
# help(as.Date), help(strftime)
# package: lubridate, timeDate

In [ ]:
# Strings already in yyyy-mm-dd form are parsed without a format argument.
mydates <- as.Date(c("2007-06-22", "2004-02-13"))

In [ ]:
mydates

In [ ]:
# Other layouts need an explicit format:
# %m = month number, %d = day of month, %Y = four-digit year.
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, "%m/%d/%Y")

In [ ]:
dates

In [ ]:
# Convert the character test dates (month/day/two-digit year) to Date objects.
# The format string gets its own name so that it does not shadow
# base::format(), which later cells in this notebook call.
date_format <- "%m/%d/%y"
leadership$testDate <- as.Date(leadership$testDate, format = date_format)
leadership

In [ ]:
# Current date as a Date object.
Sys.Date()

In [ ]:
# Current date and time as a character string.
date()

In [ ]:
# Format today's date as full month name, day of month and four-digit year.
today <- Sys.Date()
format(today, format = "%B %d %Y")

In [ ]:
# %A gives the full weekday name.
format(today, format = "%A")

In [ ]:
# Interval between two dates: subtracting one Date from another yields a
# difftime measured in days.
startDate <- as.Date("1985-07-29")
endDate   <- as.Date("2017-01-13")

days <- endDate - startDate
days

In [ ]:
# The same interval expressed in weeks. difftime(time1, time2) computes
# time1 - time2, so the later date goes first to give a positive duration,
# consistent with `days` (endDate - startDate). The unit name is spelled out
# in full rather than relying on partial matching.
difftime(endDate, startDate, units = "weeks")

Type conversion


In [ ]:
# [is|as].[numeric|character|vector|matrix|data.frame|factor|logical]

In [ ]:
# a is a numeric vector, so is.numeric() is TRUE.
a <- c(1, 2, 3)
is.numeric(a)

In [ ]:
# a is a plain atomic vector with no attributes, so is.vector() is TRUE.
is.vector(a)

In [ ]:
# as.character() converts each number to its string representation.
a <- as.character(a)
a

In [ ]:
# After the conversion a is no longer numeric, is still a vector, and is
# now of type character.
print(is.numeric(a))
print(is.vector(a))
print(is.character(a))

Sorting data


In [ ]:
# Sort by gender, then by ascending age within each gender.
# with() evaluates the sort keys inside the data frame. attach() is avoided
# because the global vectors `gender` and `age` created at the top of this
# notebook mask the attached columns, so order() would silently sort on
# those stale vectors (age still 99 where the data frame now has NA).
newdata <- leadership[with(leadership, order(gender, age)), ]
newdata

In [ ]:
# Sort by gender, then by descending age within each gender.
# with() evaluates the sort keys inside the data frame. attach() is avoided
# because the global vectors `gender` and `age` created at the top of this
# notebook mask the attached columns; sorting on the stale global `age`
# (99 where the data frame now has NA) would put that row first within its
# gender instead of last, where order() places NA keys.
newdata <- leadership[with(leadership, order(gender, -age)), ]
newdata

Merge dataset


In [ ]:
# merge: total <- merge(dataframeA, dataframeB, by=c("ID","Country"))
# cbind: total <- cbind(A, B)
# rbind: total <- rbind(dataframeA, dataframeB)

Subsetting data


In [ ]:
# select column
# select observation
# subset() function
# sample() function for random sampling; see also the sampling and survey packages

In [ ]:
# Keep columns 6 through 10 (q1 to q5), selected by position.
question_idx_first <- 6
newdata <- leadership[, question_idx_first:10]
rm(question_idx_first)
newdata

In [ ]:
# Keep the five question columns, this time selected by name.
# paste0() builds the names "q1" to "q5".
myvars <- paste0("q", 1:5)
newdata <- leadership[, myvars]
newdata

Excluding variables


In [ ]:
# Logical vector, one entry per column, TRUE where the column is q3 or q4.
myvars <- names(leadership) %in% c("q3", "q4")
myvars

In [ ]:
# Negating the mask keeps every column except q3 and q4.
newdata <- leadership[!myvars]
newdata

In [ ]:
# The same exclusion by position: negative indices drop columns 8 and 9
# (q3 and q4).
newdata <- leadership[,-c(8, 9)]
newdata

Selecting observations


In [ ]:
# Make sure testDate is a Date (month/day/two-digit year when still character),
# then define the lower and upper bounds of the date window.
leadership$testDate <- as.Date(leadership$testDate, format = "%m/%d/%y")

startdate <- as.Date("2009-01-01")
enddate   <- as.Date("2009-10-31")
print(startdate)

In [ ]:
# Show the data frame before filtering on testDate.
leadership

In [ ]:
# Keep the observations whose test date falls inside [startdate, enddate].
# Both bounds are applied so that enddate, defined alongside startdate,
# actually limits the selection. which() converts the logical mask to row
# numbers, so a missing test date cannot produce an all-NA row.
leadership[which(leadership$testDate >= startdate &
                   leadership$testDate <= enddate), ]

subset()


In [ ]:
# Rows with 24 < age <= 35; keep the columns from age through q5.
newdata <- subset(
  leadership,
  subset = age > 24 & age <= 35,
  select = age:q5
)
newdata

In [ ]:
# Rows for men aged 35 or younger; keep the columns from country through q5.
newdata <- subset(
  leadership,
  subset = gender == "M" & age <= 35,
  select = country:q5
)
newdata

sample()


In [ ]:
# Draw three distinct rows at random (sampling without replacement).
# seq_len() is used instead of 1:nrow() so that the candidate index vector
# is empty, rather than c(1, 0), when the data frame has no rows.
newdata <- leadership[
  sample(seq_len(nrow(leadership)), size = 3, replace = FALSE),
]
newdata